Data Analysis for Fibrosis paper - Loading and initial processing of stroma
Sample summary
- Our data
- stuff here
Load packages
Make up directory structure
Code
for directoryName in \
output \
output/figures \
output/rdata \
output/de
do
if [ ! -d ${directoryName} ]
then
mkdir -p ${directoryName}
fi
doneLoad functions
Functions to use for cell annotation
Code
#' Annotate cells
#'
#' This function annotates cells based on the input sobject.
#'
#' @param sobject The input sobject.
#'
#' @return A Seurat object with the annotated cells.
#'
#' @examples
#' annot_cells(sobject)
#'
annot_cells <- function(sobject) {
cell_assign <- SingleR::SingleR(
as.SingleCellExperiment(sobject),
ref = list(GetAssayData(mouse_lung_ref), mouse_immune, mouse_rna),
labels = list(
mouse_lung_ref$cell_type,
mouse_immune$label.main,
mouse_rna$label.fine),
aggr.ref = TRUE)
sobject$cell_type <- cell_assign$labels
sobject$cell_score <- cell_assign$scores %>%
apply(MARGIN = 1, function(x) max(x, na.rm = TRUE))
return(sobject)
}
#' Function to generate random colors for unique values in a vector
#'
#' This function takes a vector as input and generates random colors for each unique value in the vector.
#' The function uses the rainbow function to generate a set of colors and assigns a random color to each unique value in the input vector.
#' The seed parameter can be used to set the random seed for reproducibility.
#'
#' @param x A vector of values
#' @param seed An optional seed for the random number generator
#' @return A vector of colors, with one color assigned to each unique value in the input vector
#'
#' @examples
#' crazy_cols(c("A", "B", "C", "A", "B", "D"))
#' crazy_cols(c(1, 2, 3, 1, 2, 4))
#' crazy_cols(c("red", "green", "blue", "red", "green", "yellow"))
#'
crazy_cols <- function(x, seed = 1337) {
set.seed(seed)
sample(rainbow(n = length(unique(x))))
}
#' Calculate log fold change between two groups
#'
#' This function calculates the log fold change between two groups in a Seurat object.
#'
#' @param sobj A Seurat object
#' @param group_var The variable in the Seurat object that defines the groups
#' @param group_1 The name of the first group to compare
#' @param group_2 The name of the second group to compare
#' @param epsilon A small value to add to the denominator to avoid division by zero
#' @param assay The assay to use for the calculation
#'
#' @return A tibble with two columns: "log_fc" (the log fold change) and "gene" (the gene name)
calc_logfc <- function(sobj,
group_var,
group_1,
group_2,
epsilon = 1,
assay = "SCT") {
all_obs_exp <-
AverageExpression(sobj,
group.by = group_var,
assays = assay)[[1]] %>%
as.data.frame()
log_fc <-
tibble(log_fc = log2((all_obs_exp[[group_1]] + epsilon) /
(all_obs_exp[[group_2]] + epsilon)),
gene = rownames(all_obs_exp))
return(log_fc)
}Read in all data and process it for downstream analysis
List of seurat object saved to output/rdata/sobj_list.qs
Get cell type reference datasets
Reference data from GEO#GSE151974 ::: {.cell hash=‘main_cache/html/mouse_refs_d271cd2b0ee593344fd54b1a1c29ec6f’}
Code
ref_path <- "/home/gdrobertslab/lab/GenRef/sc_ref_datasets/mouse"
mouse_lung_ref <- qs::qread(paste0(ref_path, "/GSE151974/mouse_lung_ref.qs"))
mouse_immune <- celldex::ImmGenData()snapshotDate(): 2022-10-31
see ?celldex and browseVignettes('celldex') for documentation
loading from cache
see ?celldex and browseVignettes('celldex') for documentation
loading from cache
Code
# This label is not informative
mouse_immune <-
mouse_immune[mouse_immune$label.main != "Stromal cells", ]
mouse_rna <- celldex::MouseRNAseqData()snapshotDate(): 2022-10-31
see ?celldex and browseVignettes('celldex') for documentation
loading from cache
see ?celldex and browseVignettes('celldex') for documentation
loading from cache
:::
Load raw data and suggest cell type
Code
sample_list <- read_tsv("misc/sample_cutoffs.txt", show_col_types = FALSE)
end_path <- "/filtered_feature_bc_matrix"
if (grepl("r1pl", Sys.info()[["nodename"]])) {
data_path <- "/home/gdrobertslab/lab/Counts/"
path_use <- "cluster_path"
} else {
data_path <- "/Applications/scRNA_seq raw files/"
path_use <- "James_path"
}
sobj_list <- parallel::mclapply(seq_len(nrow(sample_list)),
mc.cores = parallelly::availableCores(),
function(i) {
obj_name <- sample_list$obj_name[i]
message(obj_name)
sobj <- tenx_load_qc(
paste0(data_path,sample_list[[path_use]][i], end_path),
min_cells = 3,
min_features = 200,
violin_plot = FALSE)
# Add metadata to dataset
for (colname in colnames(sample_list)) {
sobj[[colname]] <- sample_list[[colname]][i]
}
# Add sample name to dataset
sobj$sample <- obj_name
cutoff_table <-
tribble(~"feature", ~"min_val", ~"max_val",
"nCount_RNA", sample_list$ncount_rna_min[i], sample_list$ncount_rna_max[i],
"percent.mt", 0, sample_list$mt_max[i])
plotted <- feature_hist(
sobj,
features = c("nCount_RNA", "percent.mt"),
cutoff_table = cutoff_table)
ggsave(paste0("output/figures/feature_hist_", obj_name, ".png"),
width = 10,
height = 10,
plot = plotted)
sobj <- sobj %>%
subset(nCount_RNA >= sample_list$ncount_rna_min[i] &
nCount_RNA <= sample_list$ncount_rna_max[i] &
percent.mt <= sample_list$mt_max[i]) %>%
process_seurat()
sobj <- annot_cells(sobj)
return(sobj)
})
names(sobj_list) <- sample_list$obj_name
qs::qsave(sobj_list, file = "output/rdata/sobj_list.qs")Analyze C57BL/6 and F420 data
B6 and F420
Create the merged tumor/normal object
Process into a shared UMAP space ::: {.cell hash=‘main_cache/html/b6_f420_1_f74eadb8f8fb8a67207e861b199e80c8’}
Code
sobj_list <- qs::qread("output/rdata/sobj_list.qs")
b6_f420_combined <- merge(sobj_list[["C57BL6"]],
y = sobj_list[["F420"]],
add.cell.ids = c("C57BL6", "F420")) %>%
SCTransform(verbose = FALSE) %>%
RunPCA(npcs = 30, verbose = FALSE) %>%
RunUMAP(dims = 1:30, seed.use = 22, verbose = FALSE) %>%
FindNeighbors(k.param = 30, reduction = "umap", dims = 1:2, verbose = FALSE) %>%
FindClusters(resolution = 0.3, verbose = FALSE)Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session
:::
Inspect identities from SingleR calls and other meta
Code
# Check identities from SingleR calls and other meta
DimPlot(b6_f420_combined, group.by = "seurat_clusters", split.by = "sample", label = TRUE) +
coord_fixed() +
theme(legend.position = "none")Code
DimPlot(b6_f420_combined, group.by = "cell_type", split.by = "sample", label = TRUE) +
coord_fixed() +
theme(legend.position = "none")Code
r_feature_plot(b6_f420_combined, features = "cell_score", split.by = "sample") +
coord_fixed()Coordinate system already present. Adding new coordinate system, which will
replace the existing one.
Code
r_feature_plot(b6_f420_combined, features = "nCount_RNA", split.by = "sample") +
coord_fixed()Coordinate system already present. Adding new coordinate system, which will
replace the existing one.
Code
r_feature_plot(b6_f420_combined, features = "Col1a1", split.by = "sample") +
coord_fixed()Coordinate system already present. Adding new coordinate system, which will
replace the existing one.
Code
r_feature_plot(b6_f420_combined, features = "Col1a2", split.by = "sample") +
coord_fixed()Coordinate system already present. Adding new coordinate system, which will
replace the existing one.
Rename clusters based on the calls
Code
if(file.exists("misc/b6_f420_assignments.qs")) {
b6_f420_combined <- AddMetaData(b6_f420_combined,
qs::qread("misc/b6_f420_assignments.qs"))
} else {
# This list is manually curated from the SingleR assignments and scores
# If run again, this needs to be checked for accuracy, as clusters may change
b6_f420_combined <- RenameIdents(b6_f420_combined,
`0` = "Endo",
`1` = "Alv Mac",
`2` = "Mac",
`3` = "Mac",
`4` = "Alv Mac",
`5` = "Mac",
`6` = "Endo",
`7` = "NK/T",
`8` = "Fibroblast",
`9` = "F420",
`10` = "B",
`11` = "Mac",
`12` = "NK/T",
`13` = "Lower Airway",
`14` = "Upper Airway",
`15` = "Mono",
`16` = "F420",
`17` = "Upper Airway",
`18` = "Granulo",
`19` = "DC",
`20` = "DC",
`21` = "Erythro",
`22` = "DC",
`23` = "Peri",
`24` = "Mac",
`25` = "Alv Mac",
`26` = "Adipo",
`27` = "F420",
`28` = "Meso",
`29` = "Endo",
`30` = "SMC",
`31` = "Alv Mac",
`32` = "LowQ",
`33` = "LowQ")
b6_f420_combined$cell_type_final <- Idents(b6_f420_combined)
b6_f420_assignments <- data.frame(cell_type_final = b6_f420_combined$cell_type_final)
rownames(b6_f420_assignments) <- names(b6_f420_combined$cell_type_final)
qs::qsave(b6_f420_assignments, file = "misc/b6_f420_assignments.qs")
}Create publication plots and save
Code
# Remove tumor cells, low quality cells, and erythrocytes
Idents(b6_f420_combined) <- b6_f420_combined$cell_type_final
b6_f420_combined <- subset(b6_f420_combined,
idents = c("LowQ", "Erythro", "F420"),
invert = TRUE)
# Plot and save
r_dim_plot(b6_f420_combined, "F420 Stroma",
group.by = "cell_type_final",
split.by = "sample")Create frequency plots of stromal and immune cells
Analyze BALBC and K7M2 data
BALBC and K7M2
Code
sobj_list <- qs::qread("output/rdata/sobj_list.qs")
# Merge normal and tumor bearing lung for F420 and process into same UMAP space
balb_k7m2_combined <- merge(sobj_list[["BALBC"]],
y = sobj_list[["K7M2"]],
add.cell.ids = c("BALBC", "K7M2")) %>%
SCTransform() %>%
RunPCA(npcs = 30) %>%
RunUMAP(dims = 1:30, seed.use = 111) %>%
FindNeighbors(k.param = 30, reduction = "umap", dims = 1:2) %>%
FindClusters(resolution = 0.3)Calculating cell attributes from input UMI matrix: log_umi
Variance stabilizing transformation of count matrix of size 19480 by 22729
Model formula is y ~ log_umi
Get Negative Binomial regression parameters per gene
Using 2000 genes, 5000 cells
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
There are 1 estimated thetas smaller than 1e-07 - will be set to 1e-07
Found 106 outliers - those will be ignored in fitting/regularization step
Second step: Get residuals using fitted parameters for 19480 genes
|
| | 0%
|
|== | 3%
|
|==== | 5%
|
|===== | 8%
|
|======= | 10%
|
|========= | 13%
|
|=========== | 15%
|
|============= | 18%
|
|============== | 21%
|
|================ | 23%
|
|================== | 26%
|
|==================== | 28%
|
|====================== | 31%
|
|======================= | 33%
|
|========================= | 36%
|
|=========================== | 38%
|
|============================= | 41%
|
|=============================== | 44%
|
|================================ | 46%
|
|================================== | 49%
|
|==================================== | 51%
|
|====================================== | 54%
|
|======================================= | 56%
|
|========================================= | 59%
|
|=========================================== | 62%
|
|============================================= | 64%
|
|=============================================== | 67%
|
|================================================ | 69%
|
|================================================== | 72%
|
|==================================================== | 74%
|
|====================================================== | 77%
|
|======================================================== | 79%
|
|========================================================= | 82%
|
|=========================================================== | 85%
|
|============================================================= | 87%
|
|=============================================================== | 90%
|
|================================================================= | 92%
|
|================================================================== | 95%
|
|==================================================================== | 97%
|
|======================================================================| 100%
Computing corrected count matrix for 19480 genes
|
| | 0%
|
|== | 3%
|
|==== | 5%
|
|===== | 8%
|
|======= | 10%
|
|========= | 13%
|
|=========== | 15%
|
|============= | 18%
|
|============== | 21%
|
|================ | 23%
|
|================== | 26%
|
|==================== | 28%
|
|====================== | 31%
|
|======================= | 33%
|
|========================= | 36%
|
|=========================== | 38%
|
|============================= | 41%
|
|=============================== | 44%
|
|================================ | 46%
|
|================================== | 49%
|
|==================================== | 51%
|
|====================================== | 54%
|
|======================================= | 56%
|
|========================================= | 59%
|
|=========================================== | 62%
|
|============================================= | 64%
|
|=============================================== | 67%
|
|================================================ | 69%
|
|================================================== | 72%
|
|==================================================== | 74%
|
|====================================================== | 77%
|
|======================================================== | 79%
|
|========================================================= | 82%
|
|=========================================================== | 85%
|
|============================================================= | 87%
|
|=============================================================== | 90%
|
|================================================================= | 92%
|
|================================================================== | 95%
|
|==================================================================== | 97%
|
|======================================================================| 100%
Calculating gene attributes
Wall clock passed: Time difference of 1.79706 mins
Determine variable features
Place corrected count matrix in counts slot
Centering data matrix
Set default assay to SCT
PC_ 1
Positive: Lyz2, Cxcl2, Il1b, Apoe, Ccl6, C1qa, Arg1, Spp1, Ftl1, Cd74
C1qb, Cd14, Wfdc17, Ifi27l2a, Cybb, Ccl9, C1qc, Fth1, Ctss, Ccl4
Ctsd, Fcer1g, Tyrobp, H2-Aa, Fn1, H2-Eb1, Pf4, Chil3, Cxcl1, Ctsl
Negative: Ramp2, Cldn5, Egfl7, Calcrl, Epas1, Cdh5, Ctla2a, Tmem100, Hpgd, Clec14a
Ly6a, Adgrf5, Igfbp7, Gpihbp1, Emp2, Tspan7, Ptprb, Ly6c1, Icam2, Scn7a
Kdr, Pecam1, Fmo1, Stmn2, Plvap, Eng, Cyp4b1, Ace, Slc9a3r2, Acvrl1
PC_ 2
Positive: Wfdc2, Sftpb, Sftpa1, Sftpd, Sftpc, Cldn3, Cxcl15, Cbr2, Slc34a2, Chil1
Krt8, Cldn18, Napsa, Scd1, Krt18, Ager, Lcn2, Atp1b1, Lamp3, Chchd10
S100a4, Muc1, Ces1d, Hc, Ankrd1, Sec14l3, S100a6, Ptprf, S100g, Col1a2
Negative: Cxcl2, Apoe, Il1b, Arg1, C1qa, Ccl6, C1qb, Wfdc17, Ctsl, Spp1
Pf4, Cd14, C1qc, Ftl1, Ccl9, Ccl4, Fn1, Lyz2, Il1a, Ramp2
Cldn5, Ctsd, Ier3, Ctsb, Fcer1g, Ctss, Ccrl2, Ifi27l2a, Srgn, Ccl24
PC_ 3
Positive: Wfdc2, Cbr2, Apoe, Cldn3, Sftpb, Lyz2, Sftpa1, Sftpc, Sftpd, Cxcl15
Krt8, Slc34a2, Cxcl2, Arg1, Chil1, C1qa, Krt18, Lcn2, Cldn18, C1qb
Napsa, Chchd10, Ager, Sec14l3, Il1b, Pf4, Lamp3, Wfdc17, Ctsl, C1qc
Negative: S100a4, Ankrd1, Cd3g, S100a6, Trbc2, Ms4a4b, Trbc1, Crabp2, Il1rl1, Nkg7
Cd3d, Col1a2, Rpl39l, Lgals1, Rps24, Ccr7, Igkc, Ccl5, Icos, Satb1
Tnfrsf18, Ikzf2, Tnfrsf4, Vps37b, Rpl12, Cd79a, Ptprcap, Rpl13, Rps27, Il2rb
PC_ 4
Positive: S100a6, Apoe, S100a4, Ccl2, C1qa, Lgals1, Col1a2, Arg1, Ankrd1, C1qb
Ifitm3, Ifi27l2a, Ccl7, Bgn, C1qc, Cxcl1, Sparc, Pf4, Ctsl, Fn1
Crabp2, Il1rl1, Mif, Plac8, Cald1, Ier3, Mgp, Tpm1, Col1a1, Rpl39l
Negative: Lpl, Chil3, Plet1, Cybb, Lyz2, Ear2, Fabp1, Abcg1, Ear1, Atp6v0d2
Sirpa, Ccl6, Il18, Krt79, Ctsk, Laptm5, Ctsd, Mrc1, Wfdc21, Cd9
Slc7a2, Slpi, Gdf15, Ccnd2, Nceh1, Tlr2, Gm42418, Sgk1, Ftl1, Car4
PC_ 5
Positive: Cd74, H2-Aa, H2-Eb1, H2-Ab1, Napsa, Sftpc, Sftpa1, Sftpb, Sftpd, Cxcl15
Slc34a2, Chil1, Lcn2, Scd1, Lamp3, Cd52, Ager, Cst3, Cldn18, Hc
Muc1, Wfdc2, Igkc, S100g, Lpcat1, Ccr7, Cd79a, Atp1b1, Ctsh, Sfta2
Negative: S100a6, Col1a2, Lpl, S100a4, Ankrd1, Ccl6, Chil3, Bgn, Cybb, Crabp2
Plet1, Cxcl1, Lgals1, Aldh1a1, Il1rl1, Rarres2, Ftl1, Cald1, Mgp, Ctsd
Col1a1, Rpl39l, Sparc, Ear2, Gsn, Mt1, Timp3, Ctgf, Fam183b, Col3a1
09:51:08 UMAP embedding parameters a = 0.9922 b = 1.112
09:51:08 Read 22729 rows and found 30 numeric columns
09:51:08 Using Annoy for neighbor search, n_neighbors = 30
09:51:08 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
09:51:12 Writing NN index file to temp file /gpfs0/scratch/4151538/RtmpDRDkfh/file22d8348c23061
09:51:12 Searching Annoy index using 1 thread, search_k = 3000
09:51:21 Annoy recall = 100%
09:51:21 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
09:51:23 Found 2 connected components, falling back to 'spca' initialization with init_sdev = 1
09:51:23 Using 'irlba' for PCA
09:51:23 PCA: 2 components explained 30.11% variance
09:51:23 Scaling init to sdev = 1
09:51:23 Commencing optimization for 200 epochs, with 961250 positive edges
09:51:33 Optimization finished
Computing nearest neighbor graph
Computing SNN
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 22729
Number of edges: 762053
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9775
Number of communities: 34
Elapsed time: 1 seconds
Code
# Check identities from SingleR calls and other meta
DimPlot(balb_k7m2_combined, group.by = "seurat_clusters", split.by = "sample", label = TRUE) +
coord_fixed() +
theme(legend.position = "none")Code
DimPlot(balb_k7m2_combined, group.by = "cell_type", split.by = "sample", label = TRUE) +
coord_fixed() +
theme(legend.position = "none")Code
r_feature_plot(balb_k7m2_combined, features = "cell_score", split.by = "sample") +
coord_fixed()Coordinate system already present. Adding new coordinate system, which will
replace the existing one.
Code
r_feature_plot(balb_k7m2_combined, features = "nCount_RNA", split.by = "sample") +
coord_fixed()Coordinate system already present. Adding new coordinate system, which will
replace the existing one.
Code
r_feature_plot(balb_k7m2_combined, features = "Col1a1", split.by = "sample") +
coord_fixed()Coordinate system already present. Adding new coordinate system, which will
replace the existing one.
Code
r_feature_plot(balb_k7m2_combined, features = "Col1a2", split.by = "sample") +
coord_fixed()Coordinate system already present. Adding new coordinate system, which will
replace the existing one.
Code
# Rename clusters based on the calls
if(file.exists("misc/balb_k7m2_assignments.qs")) {
balb_k7m2_combined <- AddMetaData(balb_k7m2_combined,
qs::qread("misc/balb_k7m2_assignments.qs"))
} else {
# This list is manually curated from the SingleR assignments and scores
# If run again, this needs to be checked for accuracy, as clusters may change
balb_k7m2_combined <- RenameIdents(balb_k7m2_combined,
`0` = "Alv Mac",
`1` = "Mac",
`2` = "Mac",
`3` = "Alv Mac",
`4` = "Endo",
`5` = "DC",
`6` = "B",
`7` = "NK/T",
`8` = "K7M2",
`9` = "Lower Airway",
`10` = "NK/T",
`11` = "K7M2",
`12` = "Granulo",
`13` = "Mono",
`14` = "Endo",
`15` = "Endo",
`16` = "NK/T",
`17` = "NK/T",
`18` = "Granulo",
`19` = "NK/T",
`20` = "Mac",
`21` = "Fibroblast",
`22` = "SMC",
`23` = "Upper Airway",
`24` = "Lower Airway",
`25` = "Alv Mac",
`26` = "Endo",
`27` = "Erythro",
`28` = "Adipo",
`29` = "Alv Mac",
`30` = "LowQ",
`31` = "DC",
`32` = "LowQ",
`33` = "LowQ",
`34` = "Peri")
balb_k7m2_combined$cell_type_final <- Idents(balb_k7m2_combined)
balb_k7m2_assignments <- data.frame(cell_type_final = balb_k7m2_combined$cell_type_final)
rownames(balb_k7m2_assignments) <- names(balb_k7m2_combined$cell_type_final)
qs::qsave(balb_k7m2_assignments, file = "misc/balb_k7m2_assignments.qs")
}
# Remove tumor cells, low quality cells, and erythrocytes
Idents(balb_k7m2_combined) <- balb_k7m2_combined$cell_type_final
balb_k7m2_combined <- subset(balb_k7m2_combined,
idents = c("LowQ", "Erythro", "K7M2"),
invert = TRUE)
# Plot and save
r_dim_plot(balb_k7m2_combined, "K7M2 Stroma",
group.by = "cell_type_final",
split.by = "sample")Code
ggsave("output/figures/combined_balb_k7m2.pdf",
width = 8,
height = 6)
qs::qsave(balb_k7m2_combined, "output/rdata/balb_k7m2_combined.qs")
# Create frequency plots of stromal and immune cells
subset(balb_k7m2_combined,
idents = c("Mono", "Mac", "DC", "NK/T", "B", "Granulo", "Alv Mac"))@meta.data %>%
ggplot(aes(sample, fill = cell_type_final)) +
geom_bar(position = "fill") +
theme_classic()Code
Code
ggsave("output/figures/balb_stroma.pdf",
width = 4,
height = 4)Analyze macrophage populations within the tumor samples
This includes both metastatic and primary tumor samples Merged object saved to output/rdata/all_sobj.qs The subset macrophages from lung mets and primary tumors are saved to output/rdata/immune_recurl.qs
Make a Seurat object of all the data
Code
sobj_list <- qs::qread("output/rdata/sobj_list.qs")
all_sobj <-
merge(sobj_list[[1]],
sobj_list[2:length(sobj_list)]) %>%
SCTransform(vars.to.regress = "percent.mt",
verbose = FALSE,
return.only.var.genes = FALSE) %>%
RunPCA(verbose = FALSE, assay = "SCT") %>%
FindNeighbors(verbose = FALSE) %>%
FindClusters(verbose = FALSE) %>%
RunUMAP(verbose = FALSE, dims = 1:30)Warning in CheckDuplicateCellNames(object.list = objects): Some cell names are
duplicated across objects provided. Renaming to enforce unique cell names.
Grab just macrophages and recursively cluster them
Code
mac_types <-
c("Int Mf",
"Macrophages",
"Macrophages activated")
Idents(all_sobj) <- "cell_type"
# This brings in the recurluster function that recursively clusters the cells
# using silhouette scoring
source("scripts/recurl.R")
immune_recurl <-
subset(all_sobj,
idents = mac_types) %>%
recurluster(parallel = TRUE,
do_plots = FALSE)Clustering level 1
Optimal resolution: 0.2
Code
immune_recurl <-
immune_recurl %>%
SCTransform(vars.to.regress = "percent.mt",
verbose = FALSE,
return.only.var.genes = FALSE) %>%
RunPCA(verbose = FALSE, assay = "SCT") %>%
FindNeighbors(verbose = FALSE) %>%
FindClusters(verbose = FALSE) %>%
RunUMAP(verbose = FALSE, dims = 1:30)
qs::qsave(immune_recurl,
file = "output/rdata/immune_recurl.qs")Plot the macrophages
Code
macs_dim <-
DimPlot(immune_recurl,
group.by = c("obj_name", "clust_1", "clust_2"),
label = TRUE,
repel = TRUE,
cols = c(plot_cols, crazy_cols(immune_recurl$clust_2, seed = 1341)))
ggsave("output/figures/mac_dim2.pdf",
width = 25,
height = 8)
cluster_counts <-
table(immune_recurl$obj_name, immune_recurl$clust_2) %>%
as.data.frame() %>%
dplyr::rename("Sample" = "Var1",
"Cluster" = "Var2",
"Count" = "Freq") %>%
group_by(Cluster) %>%
arrange(desc(Count), .by_group = TRUE) %>%
ggplot(aes(y = Cluster, x = Sample, fill = log10(Count))) +
geom_tile() +
geom_text(aes(label = Count), color = "white")
ggsave("output/figures/mac_cluster_counts2.pdf",
width = 8,
height = 8)
Idents(immune_recurl) <- "clust_2"Look at inflammatory genes in macs
Code
inflam_genes <-
c("H2-Eb1", "H2-Ab1", "Cd86", "Il1a", "Il1b", "Cxcl1", "Cxcl2")
anti_inflam_genes <-
c("Gpnmb", "Fabp5", "Anpep", "C1qa", "C1qb", "Fcrls", "Trem2", "Fn1", "Spp1")
infl_macs <-
FeaturePlot(immune_recurl,
features = inflam_genes)
ggsave("output/figures/mac_inflam.pdf",
width = 8,
height = 8,
plot = infl_macs)
infl_macs_vln <-
VlnPlot(immune_recurl,
features = inflam_genes,
group.by = "clust_2",
cols = c(plot_cols, crazy_cols(immune_recurl$clust_2)),
pt.size = 0,
adjust = 0.5)
ggsave("output/figures/mac_inflam_vln.pdf",
width = 15,
height = 8,
plot = infl_macs_vln)
anti_imm_macs <-
FeaturePlot(immune_recurl,
features = anti_inflam_genes)
ggsave("output/figures/mac_anti_inflam.pdf",
width = 8,
height = 8,
plot = anti_imm_macs)
anti_imm_macs_vln <-
VlnPlot(immune_recurl,
features = anti_inflam_genes,
group.by = "clust_2",
cols = c(plot_cols, crazy_cols(immune_recurl$clust_2)),
pt.size = 0,
adjust = 0.5)
ggsave("output/figures/mac_anti_inflam_vln.pdf",
width = 15,
height = 8,
plot = anti_imm_macs_vln)
immune_recurl <-
AddModuleScore(immune_recurl,
features = list(inflam_genes),
name = "inflam_module")
immune_recurl <-
AddModuleScore(immune_recurl,
features = list(anti_inflam_genes),
name = "anti_inflam_module")
qs::qsave(immune_recurl,
file = "output/rdata/immune_recurl.qs")
imm_mod_vln <-
VlnPlot(immune_recurl,
features = c("inflam_module1", "anti_inflam_module1"),
group.by = "clust_2",
ncol = 1,
cols = c(plot_cols, crazy_cols(immune_recurl$clust_2)),
pt.size = 0,
adjust = 0.5)
ggsave("output/figures/mac_inflam_mod_vln.pdf",
width = 12,
height = 8,
plot = imm_mod_vln)Run GSEA on the macrophage populations
Define pathway categories
Code
cat_tib <-
tribble(~category, ~subcat, ~label,
"C2", "CP:KEGG", "KEGG",
"C3", "TFT:GTRD", "Transcription factors",
"C5", "GO:BP", "GO Biological Process",
"C5", "GO:MF", "GO Molecular Function",
"C8", "", "Cell type")Run GSEA on all macs between primary and met
Code
immune_recurl <- qs::qread("output/rdata/immune_recurl.qs")
tumor_only <-
subset(immune_recurl,
obj_name %in% c("K7M2", "F420", "tibia_F420", "tibia_K7M2"))
tumor_only$comp <-
if_else(tumor_only$obj_name == "tibia_F420" |
tumor_only$obj_name == "tibia_K7M2",
"Primary",
"Met")
qs::qsave(tumor_only,
file = "output/rdata/macs_tumor_only.qs")Calculate logfc using same method as Seurat
This is much faster than FindMarkers() and gives the same results ::: {.cell hash=‘main_cache/html/gsea_met_primary_ffd84f3cf70269cee1c2b667d0db0741’}
Code
prim_met_logfc <-
calc_logfc(tumor_only,
group_var = "comp",
group_1 = "Primary",
group_2 = "Met") %>%
arrange(desc(log_fc)) %>%
pull(log_fc, name = gene)
gsea_out <-
parallel::mclapply(seq_len(nrow(cat_tib)),
mc.cores = 5,
function(i) {
message(cat_tib$label[i])
kegg_ref <-
msigdbr::msigdbr(species = "Mus musculus",
category = cat_tib$category[i],
subcategory = cat_tib$subcat[i]) %>%
split(x = .$gene_symbol, f = .$gs_name)
fgsea::fgseaMultilevel(kegg_ref,
prim_met_logfc,
minSize = 15,
maxSize = 500,
nPerm = 1000) %>%
arrange(desc(NES)) %>%
mutate(pathway = str_replace_all(pathway, "_", " "),
category = cat_tib$category[i],
sub_cat = cat_tib$subcat[i],
cat_label = cat_tib$label[i]) %>%
filter(padj < 0.05)
}) %>%
bind_rows():::
Plot GSEA output
Code
top_paths <-
gsea_out %>%
group_by(cat_label, NES > 0) %>%
arrange(desc(abs(NES)), .by_group = TRUE) %>%
slice_head(n = 10) %>%
pull(pathway) %>%
str_wrap(width = 80)
gsea_dots <-
gsea_out %>%
filter(pathway %in% top_paths) %>%
mutate(pathway = fct_reorder(pathway, NES)) %>%
ggplot(aes(y = pathway,
x = NES,
size = -log10(padj))) +
geom_point() +
theme(axis.text.y = element_text(size = 4))
ggsave("output/figures/gsea_dots_macs_prim_met.pdf",
width = 8,
height = 15,
plot = gsea_dots)GSEA comparing met to primary for each cluster
Calculate logfc between met and primary for each cluster
Code
min_cell_comp <- 10
all_logfc <-
sapply(unique(tumor_only$clust_2),
USE.NAMES = TRUE,
function(x) {
test_data <-
subset(tumor_only,
clust_2 == x)
if (min(table(test_data$comp)) > min_cell_comp &&
length(unique(test_data$comp)) == 2) {
output <-
calc_logfc(test_data,
group_var = "comp",
group_1 = "Primary",
group_2 = "Met") %>%
arrange(desc(log_fc)) %>%
pull(log_fc, name = gene)
} else {
output <- NULL
}
return(output)
})
all_logfc <- all_logfc[!all_logfc %in% list(NULL)]Run the GSEA
Code
set.seed(1337)
gsea_out <-
parallel::mclapply(seq_len(nrow(cat_tib)),
mc.cores = 5,
function(i) {
message(cat_tib$label[i])
kegg_ref <-
msigdbr::msigdbr(species = "Mus musculus",
category = cat_tib$category[i],
subcategory = cat_tib$subcat[i]) %>%
split(x = .$gene_symbol, f = .$gs_name)
parallel::mclapply(names(all_logfc),
mc.cores = 5,
function(x) {
fgsea::fgseaMultilevel(kegg_ref,
all_logfc[[x]],
minSize = 15,
maxSize = 500,
nPerm = 1000) %>%
arrange(desc(NES)) %>%
mutate(pathway = str_replace_all(pathway, "_", " "),
cluster = x,
category = cat_tib$category[i],
sub_cat = cat_tib$subcat[i],
cat_label = cat_tib$label[i]) %>%
filter(padj < 0.05)
}) %>%
bind_rows()
}) %>%
bind_rows()Plot met-primary GSEA output
Code
lapply(unique(gsea_out$cat_label),
function(x) {
top_paths <-
gsea_out %>%
filter(cat_label == x) %>%
group_by(cluster) %>%
arrange(desc(abs(NES)), .by_group = TRUE) %>%
slice_head(n = 20) %>%
pull(pathway) %>%
str_wrap(width = 80)
gsea_out %>%
filter(pathway %in% top_paths) %>%
pivot_wider(names_from = pathway,
values_from = NES,
id_cols = c(cluster),
values_fill = 0) %>%
column_to_rownames("cluster") %>%
t() %>%
pheatmap::pheatmap(main = x,
filename = paste0("output/figures/gsea_",
x,
".pdf"),
width = 8,
height = 12,
fontsize_row = 6)
})[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
Confirm directionality of genes from GSEA output
Make plot to make sure I have the directionality of the DE genes correct ::: {.cell hash=‘main_cache/html/kegg_ribo_b23661e7e7a0f9087bd26bba95315750’}
Code
Warning: The following features are not present in the object: Rpl10l, not
searching for symbol synonyms
Code
The default behaviour of split.by has changed.
Separate violin plots are now plotted side-by-side.
To restore the old behaviour of a single split violin,
set split.plot = TRUE.
This message will be shown once per session.
Code
FeaturePlot(immune_recurl,
features = kegg_ribo_genes[[1]])Warning in FetchData.Seurat(object = object, vars = c(dims, "ident", features),
: The following requested variables were not found: Rpl10l
Code
Warning in FetchData.Seurat(object = object, vars = features, cells = cells):
The following requested variables were not found: Rpl10l
:::
Run GSEA to compare each cluster to all others
Calculate logfc between each cluster and all others
Code
all_logfc_clust <-
sapply(unique(tumor_only$clust_2),
USE.NAMES = TRUE,
function(x) {
test_data <- tumor_only
test_data$comp <-
if_else(test_data$clust_2 == x,
x,
"other")
output <-
calc_logfc(test_data,
group_var = "comp",
group_1 = x,
group_2 = "other") %>%
arrange(desc(log_fc)) %>%
pull(log_fc, name = gene)
return(output)
})Run the GSEA
Code
set.seed(1337)
gsea_out_clust <-
parallel::mclapply(seq_len(nrow(cat_tib)),
mc.cores = 5,
function(i) {
message(cat_tib$label[i])
kegg_ref <-
msigdbr::msigdbr(species = "Mus musculus",
category = cat_tib$category[i],
subcategory = cat_tib$subcat[i]) %>%
split(x = .$gene_symbol, f = .$gs_name)
parallel::mclapply(names(all_logfc),
mc.cores = 5,
function(x) {
fgsea::fgseaMultilevel(kegg_ref,
all_logfc[[x]],
minSize = 15,
maxSize = 500,
nPerm = 1000) %>%
arrange(desc(NES)) %>%
mutate(pathway = str_replace_all(pathway, "_", " "),
cluster = x,
category = cat_tib$category[i],
sub_cat = cat_tib$subcat[i],
cat_label = cat_tib$label[i]) %>%
filter(padj < 0.05)
}) %>%
bind_rows()
}) %>%
bind_rows()Plot GSEA output
Code
lapply(unique(gsea_out_clust$cat_label),
function(x) {
top_paths <-
gsea_out_clust %>%
filter(cat_label == x) %>%
group_by(cluster) %>%
arrange(desc(abs(NES)), .by_group = TRUE) %>%
slice_head(n = 20) %>%
pull(pathway) %>%
str_wrap(width = 80)
gsea_out_clust %>%
filter(pathway %in% top_paths) %>%
pivot_wider(names_from = pathway,
values_from = NES,
id_cols = c(cluster),
values_fill = 0) %>%
column_to_rownames("cluster") %>%
t() %>%
pheatmap::pheatmap(main = x,
filename = paste0("output/figures/gsea_by_cluster_",
x,
".pdf"),
width = 8,
height = 12,
fontsize_row = 6)
})[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
Code
R version 4.2.2 (2022-10-31)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Red Hat Enterprise Linux 8.6 (Ootpa)
Matrix products: default
BLAS: /usr/lib64/libblis.so.2.1.0
LAPACK: /gpfs0/export/apps/opt/R/4.2.2-foss-2020a/lib64/R/lib/libRlapack.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats4 stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] celldex_1.8.0 SummarizedExperiment_1.28.0
[3] Biobase_2.58.0 GenomicRanges_1.50.2
[5] GenomeInfoDb_1.34.9 IRanges_2.32.0
[7] S4Vectors_0.36.2 BiocGenerics_0.44.0
[9] MatrixGenerics_1.10.0 matrixStats_0.63.0
[11] msigdbr_7.5.1 rrrSingleCellUtils_0.9.0
[13] nichenetr_1.1.1 sctransform_0.3.5
[15] SeuratObject_4.1.3 Seurat_4.3.0
[17] reticulate_1.28 patchwork_1.1.2
[19] ggrepel_0.9.3 lubridate_1.9.2
[21] forcats_1.0.0 stringr_1.5.0
[23] dplyr_1.1.0 purrr_1.0.1
[25] readr_2.1.4 tidyr_1.3.0
[27] tibble_3.2.0 ggplot2_3.4.1
[29] tidyverse_2.0.0 cluster_2.1.4
[31] cowplot_1.1.1
loaded via a namespace (and not attached):
[1] rappdirs_0.3.3 scattermore_0.8
[3] ModelMetrics_1.2.2.2 bit64_4.0.5
[5] knitr_1.44 irlba_2.3.5.1
[7] DelayedArray_0.24.0 data.table_1.14.8
[9] rpart_4.1.19 KEGGREST_1.38.0
[11] hardhat_1.2.0 RCurl_1.98-1.10
[13] doParallel_1.0.17 generics_0.1.3
[15] RSQLite_2.3.0 RANN_2.6.1
[17] proxy_0.4-27 future_1.32.0
[19] DiagrammeR_1.0.9 bit_4.0.5
[21] tzdb_0.3.0 spatstat.data_3.0-1
[23] httpuv_1.6.9 gower_1.0.1
[25] xfun_0.39 hms_1.1.2
[27] babelgene_22.9 evaluate_0.20
[29] promises_1.2.0.1 fansi_1.0.4
[31] caTools_1.18.2 dbplyr_2.3.1
[33] igraph_1.4.1 DBI_1.1.3
[35] htmlwidgets_1.6.1 spatstat.geom_3.1-0
[37] ellipsis_0.3.2 ggpubr_0.6.0
[39] backports_1.4.1 sparseMatrixStats_1.10.0
[41] deldir_1.0-6 vctrs_0.5.2
[43] ROCR_1.0-11 abind_1.4-5
[45] caret_6.0-93 cachem_1.0.7
[47] withr_2.5.0 progressr_0.13.0
[49] checkmate_2.1.0 fdrtool_1.2.17
[51] goftest_1.2-3 ExperimentHub_2.6.0
[53] lazyeval_0.2.2 crayon_1.5.2
[55] spatstat.explore_3.1-0 recipes_1.0.5
[57] pkgconfig_2.0.3 nlme_3.1-160
[59] nnet_7.3-18 rlang_1.1.0
[61] globals_0.16.2 lifecycle_1.0.3
[63] miniUI_0.1.1.1 filelock_1.0.2
[65] extrafontdb_1.0 BiocFileCache_2.6.1
[67] AnnotationHub_3.6.0 randomForest_4.7-1.1
[69] polyclip_1.10-4 lmtest_0.9-40
[71] Matrix_1.5-1 carData_3.0-5
[73] zoo_1.8-11 base64enc_0.1-3
[75] pheatmap_1.0.12 ggridges_0.5.4
[77] GlobalOptions_0.1.2 png_0.1-8
[79] viridisLite_0.4.1 rjson_0.2.21
[81] bitops_1.0-7 KernSmooth_2.23-20
[83] visNetwork_2.1.2 pROC_1.18.0
[85] Biostrings_2.66.0 DelayedMatrixStats_1.20.0
[87] blob_1.2.3 shape_1.4.6
[89] parallelly_1.34.0 spatstat.random_3.1-4
[91] rstatix_0.7.2 ggsignif_0.6.4
[93] scales_1.2.1 memoise_2.0.1
[95] magrittr_2.0.3 plyr_1.8.8
[97] ica_1.0-3 zlibbioc_1.44.0
[99] compiler_4.2.2 RColorBrewer_1.1-3
[101] clue_0.3-64 fitdistrplus_1.1-8
[103] cli_3.6.0 XVector_0.38.0
[105] listenv_0.9.0 pbapply_1.7-0
[107] htmlTable_2.4.1 Formula_1.2-5
[109] MASS_7.3-58.1 tidyselect_1.2.0
[111] stringi_1.7.12 yaml_2.3.7
[113] grid_4.2.2 tools_4.2.2
[115] timechange_0.2.0 future.apply_1.10.0
[117] parallel_4.2.2 circlize_0.4.15
[119] rstudioapi_0.14 foreach_1.5.2
[121] foreign_0.8-83 gridExtra_2.3
[123] prodlim_2019.11.13 Rtsne_0.16
[125] digest_0.6.31 BiocManager_1.30.20
[127] shiny_1.7.4 lava_1.7.2.1
[129] Rcpp_1.0.10 car_3.1-1
[131] broom_1.0.4 BiocVersion_3.16.0
[133] later_1.3.0 RcppAnnoy_0.0.20
[135] httr_1.4.5 AnnotationDbi_1.60.2
[137] ComplexHeatmap_2.14.0 colorspace_2.1-0
[139] tensor_1.5 splines_4.2.2
[141] uwot_0.1.14 spatstat.utils_3.0-2
[143] sp_1.6-0 plotly_4.10.1
[145] xtable_1.8-4 jsonlite_1.8.4
[147] timeDate_4022.108 ipred_0.9-14
[149] R6_2.5.1 Hmisc_5.0-1
[151] pillar_1.8.1 htmltools_0.5.4
[153] mime_0.12 glue_1.6.2
[155] fastmap_1.1.1 class_7.3-20
[157] interactiveDisplayBase_1.36.0 codetools_0.2-18
[159] utf8_1.2.3 lattice_0.20-45
[161] spatstat.sparse_3.0-1 curl_5.0.0
[163] leiden_0.4.3 Rttf2pt1_1.3.12
[165] survival_3.4-0 limma_3.54.2
[167] rmarkdown_2.20 munsell_0.5.0
[169] e1071_1.7-13 GetoptLong_1.0.5
[171] GenomeInfoDbData_1.2.9 iterators_1.0.14
[173] reshape2_1.4.4 gtable_0.3.1
[175] extrafont_0.19